# -*- coding: utf-8 -*-

import os, json
# from io import StringIO
# from pdfminer.converter import TextConverter
# from pdfminer.layout import LAParams
# from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from src.utils import mkdirs, cut

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

from gensim import summarization

def run(input_dir, output_dir, stopword_path):
	"""Extract title, body text and a summary from every file in a directory.

	Each entry of ``input_dir`` is parsed as a PDF. The layout whose
	``pageid`` is 1 is handed to ``get_title``; for layouts whose ``pageid``
	is greater than 2, the trimmed text of every layout object exposing
	``get_text`` is collected. The collected lines are segmented with
	``cut`` and summarized with ``get_summary``. One record per file is
	written, as a JSON list, to ``texts.json`` inside ``output_dir``.

	Args:
		input_dir: Directory whose entries are read as PDF files.
		output_dir: Directory that receives ``texts.json``; it is passed to
			``mkdirs`` before anything is written.
		stopword_path: Path of a stop-word file, one entry per line.
	"""
	# The stop-word file is still read so that a bad path fails up front,
	# but the list is not applied: segmentation below calls cut() without it.
	with open(stopword_path) as f:
		stopwords = [line.strip() for line in f]

	mkdirs(output_dir)

	text_dicts = []

	for fn in os.listdir(input_dir):
		print('*******************************************************************')
		print('filename:', fn)

		file_dict = {'filename': fn}
		contents = []

		# os.path.join works whether or not input_dir ends with a separator,
		# and the with-block guarantees the handle is closed even when
		# parsing raises (previously one descriptor leaked per file).
		with open(os.path.join(input_dir, fn), 'rb') as fp:
			parser = PDFParser(fp)
			doc = PDFDocument()
			parser.set_document(doc)
			doc.set_parser(parser)
			doc.initialize('')

			resource = PDFResourceManager()
			laparams = LAParams()
			device = PDFPageAggregator(resource, laparams=laparams)
			interpreter = PDFPageInterpreter(resource, device)

			# Pages are read from fp while iterating, so the whole loop has
			# to stay inside the with-block.
			for page in doc.get_pages():
				interpreter.process_page(page)
				layout = device.get_result()

				pageid = layout.pageid
				if pageid == 1:
					file_dict.update(get_title(layout))
				if pageid > 2:
					for out in layout:
						if hasattr(out, 'get_text'):
							line = trim(out.get_text())
							# Skip blanks and lines that are a lone dash.
							if line and line != '-':
								contents.append(line)

		# Segment every line and drop the ones that yield no tokens.
		cut_contents = [' '.join(s) for s in (cut(con) for con in contents) if s]
		file_dict['summary'] = get_summary(cut_contents)
		file_dict['content'] = contents

		text_dicts.append(file_dict)

	with open(os.path.join(output_dir, 'texts.json'), 'w', encoding='utf-8') as f:
		json.dump(text_dicts, f, sort_keys=False, indent=2, ensure_ascii=False)


# Labels that delimit the title fields on the first page, in the order
# get_title() expects to meet them in the page text.
start_key, middle_key, end_key = (
	'项目名称',  # "project name": the project name follows this label
	'课题名称',  # "subject name": ends the project name, starts the subject
	'主管部门',  # "supervising department": ends the subject
)

def get_title(layout):
	"""Extract the project name and subject name from a page layout.

	The trimmed text of every layout object that has ``get_text`` is
	concatenated. The name is the text between ``start_key`` and
	``middle_key``; the subject is the text between ``middle_key`` and
	``end_key``. The labels themselves and any ASCII or full-width colons
	are removed from both fields.

	Missing labels are handled explicitly, because ``str.find`` returns -1
	and using that as a slice bound silently means "one before the end":
	a missing ``start_key`` or ``middle_key`` gives an empty field, and a
	missing closing label extends the field to the next label that was
	found, or to the end of the page text.

	Args:
		layout: Iterable of layout objects for one page.

	Returns:
		dict with the keys ``'name'`` and ``'subject'``.
	"""
	def strip_label(segment, label):
		# Drop the field label and both colon variants from a slice.
		return segment.replace(label, '').replace(':', '').replace('：', '')

	page_text = ''.join(
		trim(out.get_text()) for out in layout if hasattr(out, 'get_text')
	)

	si = page_text.find(start_key)
	mi = page_text.find(middle_key)
	ei = page_text.find(end_key)

	# Resolve the closing bound of each field, falling back to the next
	# available boundary when a label is absent.
	subject_end = ei if ei > -1 else len(page_text)
	name_end = mi if mi > -1 else subject_end

	pname = strip_label(page_text[si:name_end], start_key) if si > -1 else ''
	psubject = strip_label(page_text[mi:subject_end], middle_key) if mi > -1 else ''

	print('name:', pname)
	print('subject:', psubject)

	return {'name': pname, 'subject': psubject}

def get_content(layout):
	"""Placeholder with no implementation yet; ignores ``layout``, returns None."""
	return None

def get_summary(contents):
	"""Summarize segmented text and return the summary without separators.

	The items of ``contents`` are joined with ``'. '`` so that each one is
	presented to the summarizer as a sentence; afterwards the periods,
	spaces and newlines are stripped from the result again.

	Args:
		contents: List of strings to summarize (``run`` passes
			space-separated token strings).

	Returns:
		The summary string, or ``''`` when there is nothing to summarize
		or the text is too short for the summarizer.
	"""
	if not contents:
		return ''

	try:
		summ = summarization.summarize('. '.join(contents))
	except ValueError:
		# The summarizer rejects text with a single sentence; treat such a
		# document as having no summary instead of aborting the batch.
		return ''

	return summ.replace('.', '').replace(' ', '').replace('\n', '')

def trim(txt):
	"""Return ``txt`` with every space, newline and tab character removed."""
	# One translate pass deletes all three characters at once.
	return txt.translate(str.maketrans('', '', ' \n\t'))


	# for fn in os.listdir(input_dir):
	# 	print(fn)

	# 	fname = fn.replace('.pdf', '')
	# 	with open(input_dir + fn, 'rb') as f:
	# 		rsrcmgr = PDFResourceManager()
	# 		retstr = StringIO()
	# 		laparams = LAParams()

	# 		device = TextConverter(rsrcmgr, retstr, laparams = laparams)
	# 		process_pdf(rsrcmgr, device, f)
	# 		device.close()
	# 		content = retstr.getvalue()
	# 		retstr.close()

	# 		print((content))
			
			# print(content)
			# with open(output_dir + fname, 'w') as f:
			# 	f.write(content)

			# lines = str(content).split('\n')
			# with open(output_dir + fname, 'w') as f:
			# 	for l in lines:
			# 		l = l.strip().replace(' ', '')
			# 		if l:
			# 			f.write(l + '\n')

# from docx import Document

# '''
# Cannot read the entire content
# '''
# def read_docx(input_dir, output_dir):
# 	for f in os.listdir(input_dir):
# 		doc = Document(input_dir + f)
# 		content = [p.text for p in doc.paragraphs]
# 		print(content)
